In [129]:
#solrpy library: http://pythonhosted.org/solrpy/overview.html
import solr #to install: pip install solrpy
#pandas library for data processing - only needed to index the solr core, can be removed otherwise
import pandas as pd #to install: pip install pandas
#scikit-optimize library: https://github.com/scikit-optimize
import skopt #to install: pip install scikit-optimize
#numpy is used by the IR metric functions below
import numpy as np #to install: pip install numpy
In [2]:
#Settings
# The files below are in the root folder of this GitHub repo. Launch jupyter notebook from that folder
# in order to read these files: 'jupyter notebook'
# Note: this is an artificial set of jobs - they are not real jobs, but they are representative of our data
# Job descriptions are omitted, although we usually search that field as well
jobs_data_file = "jobs.csv"
# File of relevancy judgements - these are highly subjective judgements, please don't take them too seriously
relevancy_file = "relevancy_judegements.csv"
#solr url and core (Jobs)
solr_url = "http://localhost:8983/solr/Jobs"
In [3]:
# Note: You can skip this section if you were able to load the Solr Jobs Core along with the data directory from the
# './Solr Core and Config' sub-folder. Older versions of Solr won't read this data, so here's some code to populate
# the index from the jobs.csv file
jobs_df = pd.read_csv(jobs_data_file, sep=",")
jobs_df["jobSkills"] = jobs_df["jobSkills"].apply(lambda sk: sk.split("|"))
# assign a unique doc id to each row
jobs_df["id"] = range(len(jobs_df))
jobs_df.head(5)
Out[3]:
In [4]:
solr_connection = solr.Solr(solr_url, persistent=True, timeout=360, max_retries=5)
# convert dataframe to a list of dictionaries (the document format the solr client library expects)
docs = jobs_df.T.to_dict().values()
#wipe out any existing documents if present
solr_connection.delete_query("*:*")
# send documents
solr_connection.add_many(docs)
# hard commit and optimize
solr_connection.commit()
solr_connection.optimize()
Out[4]:
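As a quick sanity check that the index was populated, you can compare the document count reported by Solr with the dataframe. This is only a sketch: it assumes the solrpy response object exposes a numFound attribute, which it did in the versions we used.
resp = solr_connection.select(q="*:*", fields="id", rows=0)  # match everything, fetch no rows
print("Indexed docs: " + str(resp.numFound) + ", rows in CSV: " + str(len(jobs_df)))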
In [130]:
# The 'relevant' column is a list of document ids (the id field from the schema) that appeared in the top
# 20 returned documents and were subjectively judged as relevant to the original query.
# We later use these to compute a MAP score for each query.
rel_df = pd.read_csv(relevancy_file, sep="|", converters={"fq": str, "location": str})
searches = rel_df.T.to_dict()
rel_df.head(3)
Out[130]:
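Each row of the relevancy file holds the query, any geo filter information, and a comma-separated string of judged-relevant doc ids. A quick way to see how many judged documents each query has (a sketch, assuming the column is named 'relevant' as used below):
# distribution of the number of judged-relevant doc ids per query
rel_df["relevant"].apply(lambda r: len(str(r).split(","))).describe()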
In [154]:
# Takes a search id, a qf setting and a row count; returns the predicted doc ids and the judged-relevant doc ids
def get_results_for_search(sid, qf_value, rows):
search = searches[sid]
fq = ""
pt = "0,0"
    if not search["location"].strip() == "":
        # pull the lat,lon point out of the stored fq string (e.g. "pt=51.5,-0.1")
        splt = list(filter(lambda s: "pt=" in s, search["fq"].split("&")))
        if splt:
            pt = splt[0].replace("pt=", "")
        fq = "{!geofilt}"
resp = solr_connection.select(
q=search["query"],
fields="id",
start=0, rows=rows,
qf=qf_value, # comes from get_solr_params
fq=fq,
sfield="geoCode",
pt=pt,
score=False,
d="48.00", wt="json")
    predicted = list(map(lambda res: res["id"], resp.results))
    # return predicted doc ids, along with the relevant ones (for the IR metrics)
    return predicted, list(map(int, search["relevant"].split(",")))
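To exercise the function on a single search, something like the following works; the boost values in the qf string here are arbitrary and purely illustrative.
sample_sid = list(searches.keys())[0]
pred_ids, relevant_ids = get_results_for_search(sample_sid, "employer^1 jobTitle^5 jobSkills^2", rows=20)
len(pred_ids), relevant_ids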
In [246]:
def apk(actual, predicted, k=10):
"""
Computes the average precision at k.
This function computes the average precision at k between two lists of
items.
Parameters
----------
actual : set
A set of elements that are to be predicted (order doesn't matter)
predicted : list
A list of predicted elements (order does matter)
k : int, optional
The maximum number of predicted elements
Returns
-------
score : double
The average precision at k over the input lists
"""
if len(predicted)>k:
predicted = predicted[:k]
score = 0.0
num_hits = 0.0
for i,p in enumerate(predicted):
if p in actual and p not in predicted[:i]:
num_hits += 1.0
score += num_hits / (i+1.0)
if not actual:
return 0.0
return score / min(len(actual), k)
def mean_average_precision_at_k(actual, predicted, k=10):
"""
Computes the mean average precision at k.
This function computes the mean average precision at k between two lists
of lists of items.
Parameters
----------
actual : list
A list of sets of elements that are to be predicted
(order doesn't matter in the lists)
predicted : list
A list of lists of predicted elements
(order matters in the lists)
k : int, optional
The maximum number of predicted elements
Returns
-------
score : double
The mean average precision at k over the input lists
"""
return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])
def average_ndcg_at_k(actual, predicted, k, method=0):
vals = [ ndcg_at_k(act, pred, k, method) for act, pred in zip(actual, predicted)]
return np.mean(vals)
def ndcg_at_k(actual, predicted, k, method=0):
# convert to ratings - actual relevant results give rating of 10, vs 1 for the rest
act_hash = set(actual)
best_ratings = [ 10 for docid in actual ] + [1 for i in range(0, len(predicted) - len(actual))]
pred_ratings = [ 10 if docid in act_hash else 1 for docid in predicted ]
dcg_max = dcg_at_k(best_ratings, k, method)
if not dcg_max:
return 0.0
dcg = dcg_at_k(pred_ratings, k, method)
return dcg / dcg_max
def dcg_at_k(r, k, method=0):
"""
Code taken from: https://gist.github.com/bwhite/3726239
Score is discounted cumulative gain (dcg)
Relevance is positive real values. Can use binary
as the previous methods.
Example from
http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
>>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
>>> dcg_at_k(r, 1)
3.0
>>> dcg_at_k(r, 1, method=1)
3.0
>>> dcg_at_k(r, 2)
5.0
>>> dcg_at_k(r, 2, method=1)
4.2618595071429155
>>> dcg_at_k(r, 10)
9.6051177391888114
>>> dcg_at_k(r, 11)
9.6051177391888114
Args:
r: Relevance scores (list or numpy) in rank order
(first element is the first item)
k: Number of results to consider
method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
Returns:
Discounted cumulative gain
"""
r = np.asfarray(r)[:k]
if r.size:
if method == 0:
return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
elif method == 1:
return np.sum(r / np.log2(np.arange(2, r.size + 2)))
else:
raise ValueError('method must be 0 or 1.')
return 0.
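To make the metric concrete, a tiny worked example: with actual = [1, 2, 3] and predicted = [1, 4, 2, 5], the hits occur at ranks 1 and 3, so AP@4 = (1/1 + 2/3) / min(3, 4) ≈ 0.556.
# hits at rank 1 (precision 1/1) and rank 3 (precision 2/3), divided by min(len(actual), k) = 3
apk([1, 2, 3], [1, 4, 2, 5], k=4)  # ≈ 0.5556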
In [249]:
# Measure results for one set of qf settings (note: the objective function is defined a few cells further down)
score = objective([3,1.5,1.1])
score # Score is negative, as skopt tries to minimize the function output
Out[249]:
In [250]:
# Function takes a list of real numbers (3 are used here) and returns a dict of solr query parameters
# Uncomment the pf2/pf lines (and extend the search space to 9 dimensions) to tune phrase boosts as well
def get_solr_params(params):
    return {"qf" : "employer^{0} jobTitle^{1} jobSkills^{2}".format(*params[0:3])
            #"pf2" : "employer^{0} jobTitle^{1} jobSkills^{2}".format(*params[3:6]),
            #"pf"  : "employer^{0} jobTitle^{1} jobSkills^{2}".format(*params[6:9])
           }
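If you also want to tune phrase boosts, a sketch of the expanded mapping might look like the following (the field names come from the commented lines above; get_solr_params_with_phrase_boosts is a hypothetical helper). The search space would then need nine dimensions, and get_results_for_search would need to forward the extra pf2/pf values to solr_connection.select.
def get_solr_params_with_phrase_boosts(params):
    # hypothetical variant: 3 qf boosts, 3 pf2 boosts and 3 pf boosts (9 numbers in total)
    return {"qf"  : "employer^{0} jobTitle^{1} jobSkills^{2}".format(*params[0:3]),
            "pf2" : "employer^{0} jobTitle^{1} jobSkills^{2}".format(*params[3:6]),
            "pf"  : "employer^{0} jobTitle^{1} jobSkills^{2}".format(*params[6:9])}
# the corresponding search space would then be: space = [(min_val, max_val) for i in range(9)]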
In [270]:
# split into training and test sets of queries
sids = list(searches.keys())
cutoff = int(0.75* len(sids))
train_sids, test_sids = sids[:cutoff], sids[cutoff:]
train_sids, test_sids
Out[270]:
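If the queries in the relevancy file are not already in random order, it may be worth shuffling before the split so the held-out set is not biased; a sketch, using a fixed seed for reproducibility:
import random
random.seed(777)      # fixed seed so the split is reproducible
random.shuffle(sids)  # shuffle in place before taking the 75% / 25% split
train_sids, test_sids = sids[:cutoff], sids[cutoff:]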
In [271]:
# Precision cut off
PREC_AT = 20
# Black box objective function to minimize
# This is for the training data
def objective(params):
# map list of numbers into solr parameters (just qf in this case)
additional_params = get_solr_params(params)
predicted, actual =[],[]
for sid in train_sids:
pred, act = get_results_for_search(sid, additional_params["qf"], PREC_AT)
predicted.append(pred)
actual.append(act)
# Compute Mean average precision at 20
return -1.0 * mean_average_precision_at_k(actual, predicted, PREC_AT)
    # Can also use NDCG instead - the version above is tailored for binary judgements
#return -1.0 * average_ndcg_at_k(actual, predicted, PREC_AT)
# This is for the test data (held out dataset)
def evaluate(params):
# map list of numbers into solr parameters (just qf in this case)
additional_params = get_solr_params(params)
predicted, actual =[],[]
for sid in test_sids:
pred, act = get_results_for_search(sid, additional_params["qf"], PREC_AT)
predicted.append(pred)
actual.append(act)
# Compute Mean average precision at 20
return -1.0 * mean_average_precision_at_k(actual, predicted, PREC_AT)
In [257]:
# Example of how the black box objective function is called to measure a set of parameters (qf settings in this case)
score = objective([3, 2.5, 1.5])
# Score is negative as -1 * (IR metric), and the skopt library tries to find the parameters to minimize the score
score
Out[257]:
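Before optimizing, it can also be useful to record the score of your current production boosts as a baseline to beat; the values below are purely hypothetical.
baseline_score = objective([1.0, 1.0, 1.0])  # hypothetical current production boosts
-1.0 * baseline_score  # convert back to a positive MAP score for comparison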
In [267]:
# simple callback function to print progress while optimizing
def callback(res):
    call_no = len(res.func_vals)
    current_fun = res.func_vals[-1]
    print(str(call_no).ljust(5) + "\t" +
          str(-1.0 * current_fun).ljust(20) + "\t" +
          str([round(d, 3) for d in res.x_iters[-1]]))
The code below runs the scikit-optimize library and tries to find the set of parameters that minimizes the objective function above. We are choosing to map the parameter values to qf values (field boosts), but in theory you can tune any configuration setting that you can test in this way. Some settings, such as changes to the config files themselves, can be applied with a core reload or, in some cases, a server restart (see the sketch below). Note, however, that the algorithm needs to run for quite a few iterations to learn effectively from your data, and for some problems it may not be able to find a near-optimal solution.
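For example, if a parameter change touches solrconfig.xml or the schema, the core can be reloaded between evaluations via Solr's Core Admin API. This is just a sketch: it assumes the requests library is installed and the core is served from the default admin endpoint.
import requests
def reload_core(core_name="Jobs", solr_base="http://localhost:8983/solr"):
    # ask Solr to reload the core so config/schema changes take effect before the next evaluation
    resp = requests.get(solr_base + "/admin/cores",
                        params={"action": "RELOAD", "core": core_name, "wt": "json"})
    resp.raise_for_status()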
In [272]:
from skopt import gbrt_minimize
import datetime
ITERATIONS = 100 # in practice you want this to be high (500 calls or more); set it to a small value greater than 10 to check everything is working
min_val, max_val = 0.0, 50.0
# min and max for each possible qf value (we read 3 in get_solr_params currently)
space = [(min_val, max_val) for i in range(3)]
start = datetime.datetime.now()
print "Starting at ", start
print "Run","\t", "Current MAP", "\t\t", "Parameters"
# run optimizer, which will try to minimize the objective function
res = gbrt_minimize(objective, # the function to minimize
space, # the bounds on each dimension of x
acq="LCB", # controls how it searches for parameters
n_calls=ITERATIONS,# the number of evaluations of f including at x0
random_state=777, # set to a fixed number if you want this to be deterministic
n_jobs=-1, # how many threads (or really python processes due to GIL)
callback=callback)
end = datetime.datetime.now()
The evaluate call below uses the same logic as the objective function, but it scores the newly optimized parameters against the held-out set of queries. This gives a less biased estimate of how the new settings will perform on queries that were not in the training dataset.
In [273]:
# res.fun holds the best objective value found (the IR metric * -1); res.x holds the best performing parameters
test_score = evaluate(res.x)
test_score
Out[273]:
The score on the training queries here is much higher than on the test set. This is typical of many machine learning / optimization problems. If you are tuning an existing Solr installation, you will want to ensure that the IR metric score on the test set beats the current production settings before releasing to production.
In [276]:
print("IR Metric @" + str(PREC_AT) + " Training Data = " + str(-1 * res.fun))
print("IR Metric @" + str(PREC_AT) + " Test Data = " + str(-1 * test_score))
print("\nParameters:\n\t"),
print get_solr_params(res.x)["qf"]
print "\ngbrt_minimize took", (end - start).total_seconds(), "secs"